About

Project: FL100

About: Preproccess the data by determining the best prevalence thresholds to use. Run “standard” microbiome analysis focusing on plasma TMAO. This includes alpha and beta diversity, ANCOM differential abundance, and DESeq2 differential abundance analyses.

Inputs: Inputs include:

  1. TMAO phyloseq object
  2. Master phenotype data

Outputs: Outputs include:

  1. Plots per section [expand as analysis goes final]
  2. Tables [expand as analysis goes final]

Sources:

  1. Example using Negative Binomial in Microbiome Differential Abundance Testing
  2. Workflow for Microbiome Data Analysis: from raw reads to community analyses

Set Up

library(DESeq2)
library(phyloseq)
library(ggplot2)
library(dplyr)
library(vegan)
load("../../data/processed/microbiome/phyloseq_objects/PSOtmao_211108.RData") #PSOtmao

Preproccess

Table of read counts at the Phylum level. We will filter low abundant Phyla. We will also require that taxa are present in at least 20% of samples (at least 71 samples) and occur at least 1000 times. We will then agglomerate at the Genus level.

print("The number of different bacteria at each phyla:")
## [1] "The number of different bacteria at each phyla:"
table(tax_table(PSOtmao)[, "Phylum"])
## 
##   p__Actinobacteria    p__Bacteroidetes    p__Cyanobacteria    p__Elusimicrobia 
##                  45                 575                   8                   1 
##       p__Firmicutes     p__Fusobacteria   p__Proteobacteria     p__Spirochaetes 
##                2042                   4                  44                   1 
##    p__Synergistetes      p__Tenericutes  p__Verrucomicrobia 
##                   1                  60                   5
# Prevalence - how many samples the taxa is observed in
prevdf = apply(X = otu_table(PSOtmao),
               MARGIN = ifelse(taxa_are_rows(PSOtmao), yes = 1, no = 2),
               FUN = function(x){sum(x > 0)})

prevdf = data.frame(Prevalence = prevdf,
                    TotalAbundance = taxa_sums(PSOtmao), # the total number of occurrences (even if only observed 50 times in 1 sample, the number would be 50)
                    tax_table(PSOtmao))

print("Table showing prevalence, total abundance, and taxonomy:")
## [1] "Table showing prevalence, total abundance, and taxonomy:"
prevdf[1:20,]
##                                  Prevalence TotalAbundance     Kingdom
## 77a72e43b8d0143ee2d29c79be1b00da         25           7590 k__Bacteria
## 909ae41a723a8fd891fe419d479eedee          1             67 k__Bacteria
## ede1221982486d3a7b845b3d9b52d143          3            612 k__Bacteria
## 20f39b86c19b9d76d340da961091b85c          2            127 k__Bacteria
## a735c8388a8121dd152ff23c4542b704          1            153 k__Bacteria
## 22138725bb36874fd5109e9a0070f2f8          2             93 k__Bacteria
## 8bf97afdd2f9c12017be818206946c22          1             48 k__Bacteria
## 88efe73ef81626a8a150533dea375e82          1            151 k__Bacteria
## 54fe5a27808d4d09c806b4c562a27601          2             59 k__Bacteria
## 2b0390d3c06b1bbe94ec48f3b036a500          1             52 k__Bacteria
## aed8608657afe922ec14430de7b48f3e          1            198 k__Bacteria
## 89d1dade4f693199e3aed179f37508d1          1             87 k__Bacteria
## 6e6f0f914f6175cd071197f97b9672c4         43            558 k__Bacteria
## bf1343d2ee6d94673c0907bc566ea9d9         76            858 k__Bacteria
## f1d7ac8c18c1d8144a00e4d785c86e4e         47            529 k__Bacteria
## 484ead003b64fe20c520597fa7797d23         34           1128 k__Bacteria
## 344bc48fbf2d6572954c51cdf104061b        118           2177 k__Bacteria
## 3520cbb42eb18c7d0889d4acd1fb17c1          1             41 k__Bacteria
## 5adf9526730ed772ee3ef6cd39caeee6          2             81 k__Bacteria
## f28bcb82096cccf4d2334f12b76c3d8e        126           4355 k__Bacteria
##                                             Phylum           Class
## 77a72e43b8d0143ee2d29c79be1b00da  p__Bacteroidetes  c__Bacteroidia
## 909ae41a723a8fd891fe419d479eedee  p__Bacteroidetes  c__Bacteroidia
## ede1221982486d3a7b845b3d9b52d143  p__Bacteroidetes  c__Bacteroidia
## 20f39b86c19b9d76d340da961091b85c  p__Bacteroidetes  c__Bacteroidia
## a735c8388a8121dd152ff23c4542b704  p__Bacteroidetes  c__Bacteroidia
## 22138725bb36874fd5109e9a0070f2f8  p__Bacteroidetes  c__Bacteroidia
## 8bf97afdd2f9c12017be818206946c22  p__Bacteroidetes  c__Bacteroidia
## 88efe73ef81626a8a150533dea375e82  p__Bacteroidetes  c__Bacteroidia
## 54fe5a27808d4d09c806b4c562a27601  p__Bacteroidetes  c__Bacteroidia
## 2b0390d3c06b1bbe94ec48f3b036a500  p__Bacteroidetes  c__Bacteroidia
## aed8608657afe922ec14430de7b48f3e  p__Bacteroidetes  c__Bacteroidia
## 89d1dade4f693199e3aed179f37508d1  p__Bacteroidetes  c__Bacteroidia
## 6e6f0f914f6175cd071197f97b9672c4  p__Bacteroidetes  c__Bacteroidia
## bf1343d2ee6d94673c0907bc566ea9d9  p__Bacteroidetes  c__Bacteroidia
## f1d7ac8c18c1d8144a00e4d785c86e4e  p__Bacteroidetes  c__Bacteroidia
## 484ead003b64fe20c520597fa7797d23  p__Bacteroidetes  c__Bacteroidia
## 344bc48fbf2d6572954c51cdf104061b  p__Bacteroidetes  c__Bacteroidia
## 3520cbb42eb18c7d0889d4acd1fb17c1  p__Bacteroidetes  c__Bacteroidia
## 5adf9526730ed772ee3ef6cd39caeee6  p__Bacteroidetes  c__Bacteroidia
## f28bcb82096cccf4d2334f12b76c3d8e  p__Bacteroidetes  c__Bacteroidia
##                                              Order             Family
## 77a72e43b8d0143ee2d29c79be1b00da  o__Bacteroidales  f__Prevotellaceae
## 909ae41a723a8fd891fe419d479eedee  o__Bacteroidales  f__Prevotellaceae
## ede1221982486d3a7b845b3d9b52d143  o__Bacteroidales  f__Prevotellaceae
## 20f39b86c19b9d76d340da961091b85c  o__Bacteroidales                f__
## a735c8388a8121dd152ff23c4542b704  o__Bacteroidales                f__
## 22138725bb36874fd5109e9a0070f2f8  o__Bacteroidales                f__
## 8bf97afdd2f9c12017be818206946c22  o__Bacteroidales                f__
## 88efe73ef81626a8a150533dea375e82  o__Bacteroidales                f__
## 54fe5a27808d4d09c806b4c562a27601  o__Bacteroidales                f__
## 2b0390d3c06b1bbe94ec48f3b036a500  o__Bacteroidales                f__
## aed8608657afe922ec14430de7b48f3e  o__Bacteroidales                f__
## 89d1dade4f693199e3aed179f37508d1  o__Bacteroidales   f__Rikenellaceae
## 6e6f0f914f6175cd071197f97b9672c4  o__Bacteroidales   f__Rikenellaceae
## bf1343d2ee6d94673c0907bc566ea9d9  o__Bacteroidales   f__Rikenellaceae
## f1d7ac8c18c1d8144a00e4d785c86e4e  o__Bacteroidales   f__Rikenellaceae
## 484ead003b64fe20c520597fa7797d23  o__Bacteroidales   f__Rikenellaceae
## 344bc48fbf2d6572954c51cdf104061b  o__Bacteroidales   f__Rikenellaceae
## 3520cbb42eb18c7d0889d4acd1fb17c1  o__Bacteroidales   f__Rikenellaceae
## 5adf9526730ed772ee3ef6cd39caeee6  o__Bacteroidales   f__Rikenellaceae
## f28bcb82096cccf4d2334f12b76c3d8e  o__Bacteroidales   f__Rikenellaceae
##                                          Genus          Species
## 77a72e43b8d0143ee2d29c79be1b00da           g__              s__
## 909ae41a723a8fd891fe419d479eedee           g__              s__
## ede1221982486d3a7b845b3d9b52d143           g__              s__
## 20f39b86c19b9d76d340da961091b85c           g__              s__
## a735c8388a8121dd152ff23c4542b704           g__              s__
## 22138725bb36874fd5109e9a0070f2f8           g__              s__
## 8bf97afdd2f9c12017be818206946c22           g__              s__
## 88efe73ef81626a8a150533dea375e82           g__              s__
## 54fe5a27808d4d09c806b4c562a27601           g__              s__
## 2b0390d3c06b1bbe94ec48f3b036a500           g__              s__
## aed8608657afe922ec14430de7b48f3e           g__              s__
## 89d1dade4f693199e3aed179f37508d1  g__Rikenella              s__
## 6e6f0f914f6175cd071197f97b9672c4           g__              s__
## bf1343d2ee6d94673c0907bc566ea9d9           g__              s__
## f1d7ac8c18c1d8144a00e4d785c86e4e  g__Alistipes  s__indistinctus
## 484ead003b64fe20c520597fa7797d23           g__              s__
## 344bc48fbf2d6572954c51cdf104061b           g__              s__
## 3520cbb42eb18c7d0889d4acd1fb17c1           g__              s__
## 5adf9526730ed772ee3ef6cd39caeee6  g__Alistipes    s__finegoldii
## f28bcb82096cccf4d2334f12b76c3d8e          <NA>             <NA>
# Make table with mean prevalence, total prevalence
print("Table showing mean prevalencce and total prevalence per Phylum:")
## [1] "Table showing mean prevalencce and total prevalence per Phylum:"
plyr::ddply(prevdf, "Phylum", function(df_mb){cbind("Mean" = mean(df_mb$Prevalence),"Sum" = sum(df_mb$Prevalence))})
##                 Phylum      Mean   Sum
## 1    p__Actinobacteria 32.733333  1473
## 2     p__Bacteroidetes 12.624348  7259
## 3     p__Cyanobacteria  5.125000    41
## 4     p__Elusimicrobia  2.000000     2
## 5        p__Firmicutes 20.987757 42857
## 6      p__Fusobacteria  3.250000    13
## 7    p__Proteobacteria 21.159091   931
## 8      p__Spirochaetes  1.000000     1
## 9     p__Synergistetes 23.000000    23
## 10      p__Tenericutes  5.966667   358
## 11  p__Verrucomicrobia 35.000000   175

From the phylum table, we can see that p__Elusimicrobia, p__Spirochaetes, and p__Synergistetes have the same mean and sum, meaning they are only present in 1 sample. We are interested and confident in the more abundant taxa so manually filterr them out.

# Phylum that only occur 1 time may be artifcats. We can remove them.
filterPhyla <- c(" p__Elusimicrobia", " p__Spirochaetes", " p__Synergistetes") # note, our taxa table has these awful hidden spaces!

# Subset
PSOtmao1 = subset_taxa(PSOtmao, !Phylum %in% filterPhyla)

# View
cat("The Phyla present after this step of subsetting includes:", "\n")
## The Phyla present after this step of subsetting includes:
table(tax_table(PSOtmao1)[, "Phylum"])
## 
##   p__Actinobacteria    p__Bacteroidetes    p__Cyanobacteria       p__Firmicutes 
##                  45                 575                   8                2042 
##     p__Fusobacteria   p__Proteobacteria      p__Tenericutes  p__Verrucomicrobia 
##                   4                  44                  60                   5
# Subset to the remaining phyla
prevdf1 = subset(prevdf, Phylum %in% get_taxa_unique(PSOtmao1, "Phylum"))

# Look at phylum level distributions
ggplot(prevdf1, aes(TotalAbundance, Prevalence / nsamples(PSOtmao1),color=Phylum)) +
  geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) +  # Include a guess for parameter
  geom_point(size = 2, alpha = 0.7) +
  scale_x_log10() +  
  xlab("Total Abundance") + 
  ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~Phylum) + 
  theme(legend.position="none")

# Define prevalence threshold as 20% of total samples
prevalenceThreshold = 0.20 * nsamples(PSOtmao1)
cat("The 20% prevalence threshold is", prevalenceThreshold, "samples.", "\n")
## The 20% prevalence threshold is 71 samples.
# Execute prevalence filter, using `prune_taxa()` function
keepTaxa = rownames(prevdf1)[(prevdf1$Prevalence >= prevalenceThreshold)]
PSOtmao2 = prune_taxa(keepTaxa, PSOtmao1)

cat("Which taxa have less than 1000 counts accross all samples?", "\n")
## Which taxa have less than 1000 counts accross all samples?
which(!rowSums(otu_table(PSOtmao2)) > 1000) # 6 OTUS
## bf1343d2ee6d94673c0907bc566ea9d9 7017e1746ae742d69c50c1274cc2f02e 
##                                1                               34 
## ec49f343bf1d15df825df62235425929 6e22a1b75a66f511e1477e5777e4a1a8 
##                               45                               54 
## 5d4ff9cbbb244d61483b7c3973bdced6 3b6fc350ab79c39c956b690291226a9c 
##                               57                               58 
## 0be4018b5c0b3ee2a7d41d41678ad91e 408f2b979963b917008f80b28267a350 
##                               84                               89
# Remove these to see if that cleans ordination
PSOtmao3 <- prune_taxa(rowSums(otu_table(PSOtmao2)) >= 1000, PSOtmao2)

# Agglomerate taxa
PSOtmao4 = tax_glom(PSOtmao3, "Genus", NArm = TRUE)

Great. Do your due diligence and inspect each phyloseq object to see how the filtering and glomming steps changed the nubmer of taxonomy.

# Inspect the effect of each filtering step on the phyloseq object
PSOtmao1
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2783 taxa and 355 samples ]
## sample_data() Sample Data:       [ 355 samples by 60 sample variables ]
## tax_table()   Taxonomy Table:    [ 2783 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2783 tips and 2727 internal nodes ]
PSOtmao2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 225 taxa and 355 samples ]
## sample_data() Sample Data:       [ 355 samples by 60 sample variables ]
## tax_table()   Taxonomy Table:    [ 225 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 225 tips and 220 internal nodes ]
PSOtmao3
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 217 taxa and 355 samples ]
## sample_data() Sample Data:       [ 355 samples by 60 sample variables ]
## tax_table()   Taxonomy Table:    [ 217 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 217 tips and 212 internal nodes ]
PSOtmao4
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 43 taxa and 355 samples ]
## sample_data() Sample Data:       [ 355 samples by 60 sample variables ]
## tax_table()   Taxonomy Table:    [ 43 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 43 tips and 42 internal nodes ]

Look at Genus level distributions of the bugs.

# Prevalence - how many samples the taxa is observed in
prevdf = apply(X = otu_table(PSOtmao4),
               MARGIN = ifelse(taxa_are_rows(PSOtmao4), yes = 1, no = 2),
               FUN = function(x){sum(x > 0)})

prevdf = data.frame(Prevalence = prevdf,
                    TotalAbundance = taxa_sums(PSOtmao4), # the total number of occurrences (even if only observed 50 times in 1 sample, the number would be 50)
                    tax_table(PSOtmao4))

print("Table showing prevalence, total abundance, and taxonomy:")
## [1] "Table showing prevalence, total abundance, and taxonomy:"
prevdf[1:10,]
##                                  Prevalence TotalAbundance     Kingdom
## 7958dd98cfbbdc096240cd1dc4531423        175           5752 k__Bacteria
## 05404c9fdf9f3f334eb618bac3f434f6        228           4099 k__Bacteria
## 06105df60508c2ed24a54f1b8ed64e49        302          36587 k__Bacteria
## bb1b75f41ff9c9db1d1de41e8388eb52        355         548518 k__Bacteria
## 098c3bbd8234f4ac198297ac0bde957d        146         400319 k__Bacteria
## db8f48a3fe2fca95fb4986f5507b9076        293          48857 k__Bacteria
## 3f5533292a655d0f54a64560d65e823b        271          46075 k__Bacteria
## 803eb52cfe3d77bdf9fe14c011e425fb         98           1739 k__Bacteria
## a4bb7f5133099cd74c3817b8d3383326        189           3301 k__Bacteria
## a41766e0533ab64a9966f362a7938478        182           5737 k__Bacteria
##                                              Phylum                  Class
## 7958dd98cfbbdc096240cd1dc4531423   p__Bacteroidetes         c__Bacteroidia
## 05404c9fdf9f3f334eb618bac3f434f6   p__Bacteroidetes         c__Bacteroidia
## 06105df60508c2ed24a54f1b8ed64e49   p__Bacteroidetes         c__Bacteroidia
## bb1b75f41ff9c9db1d1de41e8388eb52   p__Bacteroidetes         c__Bacteroidia
## 098c3bbd8234f4ac198297ac0bde957d   p__Bacteroidetes         c__Bacteroidia
## db8f48a3fe2fca95fb4986f5507b9076  p__Actinobacteria      c__Actinobacteria
## 3f5533292a655d0f54a64560d65e823b  p__Actinobacteria      c__Coriobacteriia
## 803eb52cfe3d77bdf9fe14c011e425fb  p__Actinobacteria      c__Coriobacteriia
## a4bb7f5133099cd74c3817b8d3383326  p__Actinobacteria      c__Coriobacteriia
## a41766e0533ab64a9966f362a7938478  p__Proteobacteria  c__Betaproteobacteria
##                                                  Order                 Family
## 7958dd98cfbbdc096240cd1dc4531423      o__Bacteroidales       f__Rikenellaceae
## 05404c9fdf9f3f334eb618bac3f434f6      o__Bacteroidales  f__[Odoribacteraceae]
## 06105df60508c2ed24a54f1b8ed64e49      o__Bacteroidales  f__Porphyromonadaceae
## bb1b75f41ff9c9db1d1de41e8388eb52      o__Bacteroidales      f__Bacteroidaceae
## 098c3bbd8234f4ac198297ac0bde957d      o__Bacteroidales      f__Prevotellaceae
## db8f48a3fe2fca95fb4986f5507b9076  o__Bifidobacteriales  f__Bifidobacteriaceae
## 3f5533292a655d0f54a64560d65e823b   o__Coriobacteriales   f__Coriobacteriaceae
## 803eb52cfe3d77bdf9fe14c011e425fb   o__Coriobacteriales   f__Coriobacteriaceae
## a4bb7f5133099cd74c3817b8d3383326   o__Coriobacteriales   f__Coriobacteriaceae
## a41766e0533ab64a9966f362a7938478    o__Burkholderiales      f__Alcaligenaceae
##                                                Genus Species
## 7958dd98cfbbdc096240cd1dc4531423                 g__    <NA>
## 05404c9fdf9f3f334eb618bac3f434f6      g__Odoribacter    <NA>
## 06105df60508c2ed24a54f1b8ed64e49  g__Parabacteroides    <NA>
## bb1b75f41ff9c9db1d1de41e8388eb52      g__Bacteroides    <NA>
## 098c3bbd8234f4ac198297ac0bde957d       g__Prevotella    <NA>
## db8f48a3fe2fca95fb4986f5507b9076  g__Bifidobacterium    <NA>
## 3f5533292a655d0f54a64560d65e823b      g__Collinsella    <NA>
## 803eb52cfe3d77bdf9fe14c011e425fb      g__Eggerthella    <NA>
## a4bb7f5133099cd74c3817b8d3383326                 g__    <NA>
## a41766e0533ab64a9966f362a7938478       g__Sutterella    <NA>
# Family
ggplot(prevdf1, aes(TotalAbundance, Prevalence / nsamples(PSOtmao4),color=Family)) +
  geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) + # Include a guess for parameter
  geom_point(size = 1, alpha = 0.7) +
  scale_x_log10() +
  xlab("Total Abundance") +
  ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~Family) +
  theme(legend.position="none")

# Genus - have to blow it up to make it useful
ggplot(prevdf1, aes(TotalAbundance, Prevalence / nsamples(PSOtmao4),color=Genus)) +
  geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) + # Include a guess for parameter
  geom_point(size = 1, alpha = 0.7) +
  scale_x_log10() +
  xlab("Total Abundance") +
  ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~Genus) +
  theme(legend.position="none")

Look for Outliers Using PCoA

We want to identify outlier taxa that might drive the statistical analyses. One way to do this is to look for outliers in the PCoA plot.

PCoA_BC <- ordinate(PSOtmao4, "PCoA", "bray")

# BC
# taxa is similar to species
taxa_plot <- plot_ordination(PSOtmao4, PCoA_BC, type = "taxa")
taxa_plot

# It looks like there is an outlier in the top left corner of the plot. 
# Identify which taxa that bacteria is from and remove it. 
tp_data <- taxa_plot$data
plot(tp_data$Axis.2) # yes an outlier!

# Who is it?
outlier <- tp_data[tp_data$Axis.2 == max(tp_data$Axis.2), ] 
cat("Which genus is the outlier?", outlier$Genus, "\n")
## Which genus is the outlier?  g__Prevotella
# Remove it
filterGenus <- c(" g__Prevotella") # note, our taxa table has these awful hidden spaces!

# Subset
PSOtmao5 = subset_taxa(PSOtmao4, !Genus %in% filterGenus)

# Replot:
PCoA_BC2 <- ordinate(PSOtmao5, "PCoA", "bray")

# BC Round 2
# taxa is similar to species
taxa_plot2 <- plot_ordination(PSOtmao5, PCoA_BC2, type = "taxa")
taxa_plot2

# samples is similar to sites
plot_ordination(PSOtmao5, PCoA_BC2, type = "samples", color = "age") 

# Which taxa have less than 1000 counts?
which(!rowSums(otu_table(PSOtmao5)) > 1000) # none, good
## named integer(0)

View the phyloseq objects again. We see that the most dramatic effect was when we filtered for phyla present in 20% of samples (PSOtmao1 to 2).

PSOtmao
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2786 taxa and 355 samples ]
## sample_data() Sample Data:       [ 355 samples by 60 sample variables ]
## tax_table()   Taxonomy Table:    [ 2786 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2786 tips and 2730 internal nodes ]
PSOtmao1
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2783 taxa and 355 samples ]
## sample_data() Sample Data:       [ 355 samples by 60 sample variables ]
## tax_table()   Taxonomy Table:    [ 2783 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2783 tips and 2727 internal nodes ]
PSOtmao2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 225 taxa and 355 samples ]
## sample_data() Sample Data:       [ 355 samples by 60 sample variables ]
## tax_table()   Taxonomy Table:    [ 225 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 225 tips and 220 internal nodes ]
PSOtmao3
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 217 taxa and 355 samples ]
## sample_data() Sample Data:       [ 355 samples by 60 sample variables ]
## tax_table()   Taxonomy Table:    [ 217 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 217 tips and 212 internal nodes ]
PSOtmao4
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 43 taxa and 355 samples ]
## sample_data() Sample Data:       [ 355 samples by 60 sample variables ]
## tax_table()   Taxonomy Table:    [ 43 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 43 tips and 42 internal nodes ]
PSOtmao5 
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 42 taxa and 355 samples ]
## sample_data() Sample Data:       [ 355 samples by 60 sample variables ]
## tax_table()   Taxonomy Table:    [ 42 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 42 tips and 41 internal nodes ]

Add age and bmi categories to the “final” phyloseq object. This will come in handy when building the DESeq2 analysis and downstream plotting.

# Add age_cat
sample_data(PSOtmao5)$age_cat <- ifelse(sample_data(PSOtmao5)$age < 34, 1,
                         ifelse(sample_data(PSOtmao5)$age >= 34 & sample_data(PSOtmao5)$age < 50, 2, 3))
sample_data(PSOtmao5)$age_cat <- as.factor(sample_data(PSOtmao5)$age_cat)
# Add bmi_cat
sample_data(PSOtmao5)$bmi_cat <- ifelse(sample_data(PSOtmao5)$bmi_final.x < 25, 1,
                         ifelse(sample_data(PSOtmao5)$bmi_final.x >= 25 & sample_data(PSOtmao5)$bmi_final.x < 30, 2, 3))
sample_data(PSOtmao5)$bmi_cat <- as.factor(sample_data(PSOtmao5)$bmi_cat)

# Make sure "below" is the reference factor
str(sample_data(PSOtmao5)$tmao_mdn)
##  Factor w/ 2 levels "above","below": 2 2 1 2 1 2 2 2 2 1 ...
sample_data(PSOtmao5)$tmao_mdn <- relevel(sample_data(PSOtmao5)$tmao_mdn, ref = "below")

DESeq2

From the DESeq2 documentation, “Note: In order to benefit from the default settings of the package, you should put the variable of interest at the end of the formula and make sure the control level is the first level.”

# Grab the sample data to easily check the structure of your covariates, age, sex, and BMI
sd <- sample_data(PSOtmao5)
str(sd$sex)
##  int [1:355] 1 2 2 2 2 2 2 2 2 1 ...
str(sd$bmi_final.x)
##  num [1:355] 20.8 23 21.6 27.5 25.9 ...
# Factor sex, BMI okay
sample_data(PSOtmao5)$sex = factor(sample_data(PSOtmao5)$sex)

# DESeq analysis
mb <- phyloseq_to_deseq2(PSOtmao5, ~ sex:age + bmi_final.x + tmao_mdn)
## converting counts to integer mode
##   the design formula contains one or more numeric variables with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function
##   the design formula contains one or more numeric variables that have mean or
##   standard deviation larger than 5 (an arbitrary threshold to trigger this message).
##   Including numeric variables with large mean can induce collinearity with the intercept.
##   Users should center and scale numeric variables in the design to improve GLM convergence.
#mb2 <- estimateSizeFactors(mb, type="poscount")
mb3 = DESeq(mb, test="Wald", fitType="parametric")
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Results
res = results(mb3, cooksCutoff = FALSE)
alpha = 0.05
sigtab = res[which(res$padj < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(PSOtmao5)[rownames(sigtab), ], "matrix"))
sigtab$family_genus <- paste0(sigtab$Family, sigtab$Genus)
head(sigtab)
##                                   baseMean log2FoldChange       lfcSE      stat
## a4bb7f5133099cd74c3817b8d3383326  15.21719     0.04586022 0.014146267  3.241860
## 7363aac88b5325ac49cb469e284e10dd  13.47477     0.07071319 0.021436916  3.298664
## e1a2800b24cdf9779b28dc897cddb12a  64.04318     0.04589202 0.016491688  2.782736
## 172049e46605909f99651e4d0efb8578 144.35632     0.01807500 0.006445167  2.804426
## 8c6b152535efbebf12179855cb31b8d8 147.71351     0.02207538 0.007428151  2.971854
## e4ae256bb51896c21795f743dc9ed9dd 438.94106    -0.02536227 0.006051082 -4.191362
##                                        pvalue        padj     Kingdom
## a4bb7f5133099cd74c3817b8d3383326 1.187523e-03 0.016625322 k__Bacteria
## 7363aac88b5325ac49cb469e284e10dd 9.714596e-04 0.016625322 k__Bacteria
## e1a2800b24cdf9779b28dc897cddb12a 5.390264e-03 0.037731849 k__Bacteria
## 172049e46605909f99651e4d0efb8578 5.040621e-03 0.037731849 k__Bacteria
## 8c6b152535efbebf12179855cb31b8d8 2.960069e-03 0.031080728 k__Bacteria
## e4ae256bb51896c21795f743dc9ed9dd 2.772851e-05 0.001164597 k__Bacteria
##                                              Phylum              Class
## a4bb7f5133099cd74c3817b8d3383326  p__Actinobacteria  c__Coriobacteriia
## 7363aac88b5325ac49cb469e284e10dd      p__Firmicutes         c__Bacilli
## e1a2800b24cdf9779b28dc897cddb12a      p__Firmicutes      c__Clostridia
## 172049e46605909f99651e4d0efb8578      p__Firmicutes      c__Clostridia
## 8c6b152535efbebf12179855cb31b8d8      p__Firmicutes      c__Clostridia
## e4ae256bb51896c21795f743dc9ed9dd      p__Firmicutes      c__Clostridia
##                                                 Order                  Family
## a4bb7f5133099cd74c3817b8d3383326  o__Coriobacteriales    f__Coriobacteriaceae
## 7363aac88b5325ac49cb469e284e10dd   o__Lactobacillales     f__Streptococcaceae
## e1a2800b24cdf9779b28dc897cddb12a     o__Clostridiales  f__Christensenellaceae
## 172049e46605909f99651e4d0efb8578     o__Clostridiales       f__Clostridiaceae
## 8c6b152535efbebf12179855cb31b8d8     o__Clostridiales      f__Lachnospiraceae
## e4ae256bb51896c21795f743dc9ed9dd     o__Clostridiales      f__Lachnospiraceae
##                                               Genus Species
## a4bb7f5133099cd74c3817b8d3383326                g__    <NA>
## 7363aac88b5325ac49cb469e284e10dd     g__Lactococcus    <NA>
## e1a2800b24cdf9779b28dc897cddb12a                g__    <NA>
## 172049e46605909f99651e4d0efb8578     g__Clostridium    <NA>
## 8c6b152535efbebf12179855cb31b8d8     g__Lachnospira    <NA>
## e4ae256bb51896c21795f743dc9ed9dd  g__[Ruminococcus]    <NA>
##                                                           family_genus
## a4bb7f5133099cd74c3817b8d3383326              f__Coriobacteriaceae g__
## 7363aac88b5325ac49cb469e284e10dd    f__Streptococcaceae g__Lactococcus
## e1a2800b24cdf9779b28dc897cddb12a            f__Christensenellaceae g__
## 172049e46605909f99651e4d0efb8578      f__Clostridiaceae g__Clostridium
## 8c6b152535efbebf12179855cb31b8d8     f__Lachnospiraceae g__Lachnospira
## e4ae256bb51896c21795f743dc9ed9dd  f__Lachnospiraceae g__[Ruminococcus]
sigtabgen = subset(sigtab, !is.na(Genus))
# Phylum order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x))
# Genus order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Genus = factor(as.character(sigtabgen$Genus), levels=names(x))
ggplot(sigtabgen, aes(y=family_genus, x=log2FoldChange, color=Phylum)) + 
  geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
  geom_point(size=3) + 
  xlab("Taxonomy") +
  ylab("Log2 Fold Change") + 
  scale_color_brewer(palette = "Dark2") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))

Alpha Diversity Analysis

We need to recalculate the alpha diversity metrics now that we have removed and glommed the taxa data.

#plot_richness(PSOtmao4, x="tmao_mdn", color="bmi_cat", measures=c("Chao1", "Shannon"))

# Recalculate alpha diversity now that some taxonomy have been filtered out
updated_alpha <- estimate_richness(PSOtmao4)
row.names(updated_alpha) <- gsub("X", "", row.names(updated_alpha))

First, load the master phenotype file so that we can have complete access to our covariate variables. Merge the master phenotyp file with the sample data from the phyloseq object.

# Load master pheno file
phen <- readRDS("../../data/processed/phenotype/Phenotypes_211108.rds")
phen <- subset(phen, select = -c(sex, age, age_cat, bmi_cat))

# Check bin structure and factor if needed
#str(phen$age_cat)
#phen$age_cat <- as.factor(phen$age_cat)
#str(phen$bmi_cat)phen$bmi_cat <- as.factor(phen$bmi_cat)

# Grab PSO sample data
df <- as(sample_data(PSOtmao5), "data.frame")

# Merge
df2 <- merge(phen, df, by = 0)
row.names(df2) <- df2[,"Row.names"]
df2 <- subset(df2, select =-c(Row.names))

# Merge updated alpha
df_alpha2 <- merge(updated_alpha, df2, by = 0)
row.names(df_alpha2) <- df_alpha2[,"Row.names"]
df_alpha2 <- subset(df_alpha2, select =-c(Row.names))

# Original alpha
summary(lm(shannon ~ tmao_log + sex*age + cystatinc_bd1 + ethnicity, df2))
## 
## Call:
## lm(formula = shannon ~ tmao_log + sex * age + cystatinc_bd1 + 
##     ethnicity, data = df2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98513 -0.27222  0.07201  0.38006  1.12405 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.144387   0.228419  22.522  < 2e-16 ***
## tmao_log       0.211111   0.060676   3.479 0.000568 ***
## sex2           0.195224   0.190542   1.025 0.306298    
## age            0.008583   0.003284   2.614 0.009359 ** 
## cystatinc_bd1 -0.242566   0.224313  -1.081 0.280300    
## ethnicity.L   -0.251200   0.132485  -1.896 0.058801 .  
## ethnicity.Q    0.125637   0.133695   0.940 0.348029    
## ethnicity.C   -0.064601   0.118927  -0.543 0.587347    
## ethnicity^4    0.065687   0.110580   0.594 0.552894    
## ethnicity^5   -0.128186   0.114491  -1.120 0.263669    
## sex2:age      -0.003636   0.004430  -0.821 0.412293    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5542 on 339 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.1729, Adjusted R-squared:  0.1485 
## F-statistic: 7.088 on 10 and 339 DF,  p-value: 3.857e-10
# Updated alpha
summary(lm(df_alpha2$Shannon ~ tmao_log + sex*age + cystatinc_bd1 + ethnicity, df_alpha2))
## 
## Call:
## lm(formula = df_alpha2$Shannon ~ tmao_log + sex * age + cystatinc_bd1 + 
##     ethnicity, data = df_alpha2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.45905 -0.12780  0.05385  0.19005  0.61243 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.2120178  0.1228891  18.000  < 2e-16 ***
## tmao_log       0.0959369  0.0326537   2.938  0.00353 ** 
## sex2           0.0205260  0.1025649   0.200  0.84150    
## age            0.0028624  0.0017666   1.620  0.10610    
## cystatinc_bd1 -0.0887005  0.1206672  -0.735  0.46279    
## ethnicity.L   -0.1398545  0.0713168  -1.961  0.05069 .  
## ethnicity.Q   -0.0167910  0.0719726  -0.233  0.81567    
## ethnicity.C   -0.0603182  0.0640218  -0.942  0.34678    
## ethnicity^4    0.0304270  0.0595321   0.511  0.60961    
## ethnicity^5   -0.0177515  0.0616383  -0.288  0.77353    
## sex2:age      -0.0006458  0.0023829  -0.271  0.78655    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2983 on 341 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.09236,    Adjusted R-squared:  0.06574 
## F-statistic:  3.47 on 10 and 341 DF,  p-value: 0.0002307

Check the distributions of the alpha diversity metrics and run the multiple linear regression models.

# Check distribution
shapiro.test(df2$shannon) # not normal, proceed with caution
## 
##  Shapiro-Wilk normality test
## 
## data:  df2$shannon
## W = 0.905, p-value = 4.423e-14
shapiro.test(log(df2$shannon))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(df2$shannon)
## W = 0.84813, p-value < 2.2e-16
# Run linear regression
summary(lm(shannon ~ tmao_log + sex*age + cystatinc_bd1 + ethnicity, df_alpha2))
## 
## Call:
## lm(formula = shannon ~ tmao_log + sex * age + cystatinc_bd1 + 
##     ethnicity, data = df_alpha2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98513 -0.27222  0.07201  0.38006  1.12405 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.144387   0.228419  22.522  < 2e-16 ***
## tmao_log       0.211111   0.060676   3.479 0.000568 ***
## sex2           0.195224   0.190542   1.025 0.306298    
## age            0.008583   0.003284   2.614 0.009359 ** 
## cystatinc_bd1 -0.242566   0.224313  -1.081 0.280300    
## ethnicity.L   -0.251200   0.132485  -1.896 0.058801 .  
## ethnicity.Q    0.125637   0.133695   0.940 0.348029    
## ethnicity.C   -0.064601   0.118927  -0.543 0.587347    
## ethnicity^4    0.065687   0.110580   0.594 0.552894    
## ethnicity^5   -0.128186   0.114491  -1.120 0.263669    
## sex2:age      -0.003636   0.004430  -0.821 0.412293    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5542 on 339 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.1729, Adjusted R-squared:  0.1485 
## F-statistic: 7.088 on 10 and 339 DF,  p-value: 3.857e-10
# Check distribution
shapiro.test(df2$faith_pd) # normal
## 
##  Shapiro-Wilk normality test
## 
## data:  df2$faith_pd
## W = 0.99699, p-value = 0.7601
# Run linear regression
summary(lm(faith_pd ~ tmao_log + sex*age + cystatinc_bd1 + ethnicity, df_alpha2))
## 
## Call:
## lm(formula = faith_pd ~ tmao_log + sex * age + cystatinc_bd1 + 
##     ethnicity, data = df_alpha2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7613 -0.8853 -0.0213  0.9188  4.3240 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.100608   0.602002  11.795  < 2e-16 ***
## tmao_log       0.515615   0.159913   3.224  0.00139 ** 
## sex2           0.178380   0.502178   0.355  0.72265    
## age            0.022292   0.008655   2.576  0.01043 *  
## cystatinc_bd1 -0.475666   0.591182  -0.805  0.42161    
## ethnicity.L   -1.059985   0.349166  -3.036  0.00259 ** 
## ethnicity.Q   -0.157923   0.352357  -0.448  0.65430    
## ethnicity.C   -0.038737   0.313434  -0.124  0.90171    
## ethnicity^4   -0.186550   0.291436  -0.640  0.52254    
## ethnicity^5   -0.787186   0.301743  -2.609  0.00949 ** 
## sex2:age      -0.010342   0.011674  -0.886  0.37633    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.461 on 339 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.1669, Adjusted R-squared:  0.1423 
## F-statistic: 6.791 on 10 and 339 DF,  p-value: 1.156e-09
# Check distribution
shapiro.test(df2$pielou_e) # not normal, proceed with caution
## 
##  Shapiro-Wilk normality test
## 
## data:  df2$pielou_e
## W = 0.81544, p-value < 2.2e-16
# Run linear regression
summary(lm(pielou_e ~ tmao_log + sex*age + cystatinc_bd1 + ethnicity, df_alpha2))
## 
## Call:
## lm(formula = pielou_e ~ tmao_log + sex * age + cystatinc_bd1 + 
##     ethnicity, data = df_alpha2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28196 -0.01617  0.01120  0.03592  0.09891 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.7441719  0.0246064  30.243   <2e-16 ***
## tmao_log       0.0154432  0.0065363   2.363   0.0187 *  
## sex2           0.0272373  0.0205262   1.327   0.1854    
## age            0.0006486  0.0003538   1.833   0.0676 .  
## cystatinc_bd1 -0.0161791  0.0241641  -0.670   0.5036    
## ethnicity.L   -0.0102842  0.0142719  -0.721   0.4717    
## ethnicity.Q    0.0221684  0.0144023   1.539   0.1247    
## ethnicity.C   -0.0060722  0.0128114  -0.474   0.6358    
## ethnicity^4    0.0105841  0.0119122   0.889   0.3749    
## ethnicity^5   -0.0011467  0.0123335  -0.093   0.9260    
## sex2:age      -0.0003379  0.0004772  -0.708   0.4794    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0597 on 339 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.1163, Adjusted R-squared:  0.09023 
## F-statistic: 4.461 on 10 and 339 DF,  p-value: 6.363e-06
# Check distribution
shapiro.test(df2$observed_otus) # normal
## 
##  Shapiro-Wilk normality test
## 
## data:  df2$observed_otus
## W = 0.99609, p-value = 0.5367
# Run linear regression
summary(lm(observed_otus ~ tmao_log + sex*age + cystatinc_bd1 + ethnicity, df_alpha2))
## 
## Call:
## lm(formula = observed_otus ~ tmao_log + sex * age + cystatinc_bd1 + 
##     ethnicity, data = df_alpha2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.331 -20.287  -0.707  18.771  88.101 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   120.8853    12.3560   9.783  < 2e-16 ***
## tmao_log       11.5165     3.2822   3.509 0.000511 ***
## sex2            1.4291    10.3072   0.139 0.889805    
## age             0.4999     0.1776   2.814 0.005181 ** 
## cystatinc_bd1 -11.3839    12.1340  -0.938 0.348818    
## ethnicity.L   -18.0996     7.1666  -2.526 0.012007 *  
## ethnicity.Q    -2.2445     7.2321  -0.310 0.756481    
## ethnicity.C    -1.7351     6.4332  -0.270 0.787550    
## ethnicity^4    -0.2377     5.9817  -0.040 0.968328    
## ethnicity^5   -14.0427     6.1932  -2.267 0.023994 *  
## sex2:age       -0.1737     0.2396  -0.725 0.469082    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.98 on 339 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.1683, Adjusted R-squared:  0.1437 
## F-statistic: 6.858 on 10 and 339 DF,  p-value: 9.011e-10
#~~~~~~~~~~~
# Shannon
#~~~~~~~~~~~
# Check distribution
shapiro.test(df_alpha2$Shannon) # not normal, proceed with caution
## 
##  Shapiro-Wilk normality test
## 
## data:  df_alpha2$Shannon
## W = 0.88221, p-value = 7.335e-16
shapiro.test((df_alpha2$shannon)^.5) # not better
## 
##  Shapiro-Wilk normality test
## 
## data:  (df_alpha2$shannon)^0.5
## W = 0.87856, p-value = 4.567e-16
# Run linear regression
summary(lm(Shannon ~ tmao_log + sex*age + cystatinc_bd1 + ethnicity, df_alpha2))
## 
## Call:
## lm(formula = Shannon ~ tmao_log + sex * age + cystatinc_bd1 + 
##     ethnicity, data = df_alpha2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.45905 -0.12780  0.05385  0.19005  0.61243 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.2120178  0.1228891  18.000  < 2e-16 ***
## tmao_log       0.0959369  0.0326537   2.938  0.00353 ** 
## sex2           0.0205260  0.1025649   0.200  0.84150    
## age            0.0028624  0.0017666   1.620  0.10610    
## cystatinc_bd1 -0.0887005  0.1206672  -0.735  0.46279    
## ethnicity.L   -0.1398545  0.0713168  -1.961  0.05069 .  
## ethnicity.Q   -0.0167910  0.0719726  -0.233  0.81567    
## ethnicity.C   -0.0603182  0.0640218  -0.942  0.34678    
## ethnicity^4    0.0304270  0.0595321   0.511  0.60961    
## ethnicity^5   -0.0177515  0.0616383  -0.288  0.77353    
## sex2:age      -0.0006458  0.0023829  -0.271  0.78655    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2983 on 341 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.09236,    Adjusted R-squared:  0.06574 
## F-statistic:  3.47 on 10 and 341 DF,  p-value: 0.0002307
#~~~~~~~~~~~
# Chao1
#~~~~~~~~~~~
# Check distribution
shapiro.test(df_alpha2$Chao1) # ehh not normal
## 
##  Shapiro-Wilk normality test
## 
## data:  df_alpha2$Chao1
## W = 0.9408, p-value = 1.065e-10
# Run linear regression
summary(lm(Chao1 ~ tmao_log + sex*age + cystatinc_bd1 + ethnicity, df_alpha2))
## 
## Call:
## lm(formula = Chao1 ~ tmao_log + sex * age + cystatinc_bd1 + ethnicity, 
##     data = df_alpha2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.2537  -1.7308   0.1194   2.7627   8.3159 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   29.894387   1.589163  18.811   <2e-16 ***
## tmao_log       0.770304   0.422267   1.824   0.0690 .  
## sex2          -0.568427   1.326337  -0.429   0.6685    
## age            0.007973   0.022845   0.349   0.7273    
## cystatinc_bd1 -1.839881   1.560430  -1.179   0.2392    
## ethnicity.L   -1.929598   0.922246  -2.092   0.0372 *  
## ethnicity.Q   -0.502759   0.930727  -0.540   0.5894    
## ethnicity.C   -1.725791   0.827909  -2.085   0.0379 *  
## ethnicity^4   -0.851071   0.769850  -1.106   0.2697    
## ethnicity^5   -1.619154   0.797087  -2.031   0.0430 *  
## sex2:age      -0.003494   0.030814  -0.113   0.9098    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.858 on 341 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.08515,    Adjusted R-squared:  0.05832 
## F-statistic: 3.174 on 10 and 341 DF,  p-value: 0.0006576
#~~~~~~~~~~~
# InvSimpson
#~~~~~~~~~~~
# Check distribution
shapiro.test(df_alpha2$InvSimpson) # normal
## 
##  Shapiro-Wilk normality test
## 
## data:  df_alpha2$InvSimpson
## W = 0.99288, p-value = 0.09029
# Run linear regression
summary(lm(InvSimpson ~ tmao_log + sex*age + cystatinc_bd1 + ethnicity, df_alpha2))
## 
## Call:
## lm(formula = InvSimpson ~ tmao_log + sex * age + cystatinc_bd1 + 
##     ethnicity, data = df_alpha2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.326 -1.430  0.085  1.543  7.086 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.61379    0.99740   6.631 1.31e-10 ***
## tmao_log       0.58380    0.26503   2.203   0.0283 *  
## sex2          -0.47008    0.83244  -0.565   0.5726    
## age            0.01121    0.01434   0.782   0.4349    
## cystatinc_bd1 -0.17930    0.97936  -0.183   0.8548    
## ethnicity.L   -0.65807    0.57882  -1.137   0.2564    
## ethnicity.Q    0.33403    0.58415   0.572   0.5678    
## ethnicity.C    0.24684    0.51962   0.475   0.6351    
## ethnicity^4    0.30597    0.48318   0.633   0.5270    
## ethnicity^5    0.13440    0.50027   0.269   0.7884    
## sex2:age       0.01118    0.01934   0.578   0.5637    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.421 on 341 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.0577, Adjusted R-squared:  0.03006 
## F-statistic: 2.088 on 10 and 341 DF,  p-value: 0.02485
#~~~~~~~~~~~
# Observed
#~~~~~~~~~~~
# Check distribution
shapiro.test(df_alpha2$Observed) # eh not normal
## 
##  Shapiro-Wilk normality test
## 
## data:  df_alpha2$Observed
## W = 0.9408, p-value = 1.065e-10
# Run linear regression
summary(lm(Observed ~ tmao_log + sex*age + cystatinc_bd1 + ethnicity, df_alpha2))
## 
## Call:
## lm(formula = Observed ~ tmao_log + sex * age + cystatinc_bd1 + 
##     ethnicity, data = df_alpha2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.2537  -1.7308   0.1194   2.7627   8.3159 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   29.894387   1.589163  18.811   <2e-16 ***
## tmao_log       0.770304   0.422267   1.824   0.0690 .  
## sex2          -0.568427   1.326337  -0.429   0.6685    
## age            0.007973   0.022845   0.349   0.7273    
## cystatinc_bd1 -1.839881   1.560430  -1.179   0.2392    
## ethnicity.L   -1.929598   0.922246  -2.092   0.0372 *  
## ethnicity.Q   -0.502759   0.930727  -0.540   0.5894    
## ethnicity.C   -1.725791   0.827909  -2.085   0.0379 *  
## ethnicity^4   -0.851071   0.769850  -1.106   0.2697    
## ethnicity^5   -1.619154   0.797087  -2.031   0.0430 *  
## sex2:age      -0.003494   0.030814  -0.113   0.9098    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.858 on 341 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.08515,    Adjusted R-squared:  0.05832 
## F-statistic: 3.174 on 10 and 341 DF,  p-value: 0.0006576

Alpha Diversity Plots

Make plots that demonstrate the relationship between TMAO and the alpha-diversity metrics.

# Shannon
y_variable <- "Shannon"
x_variable <- "Natural Log of TMAO (uM)"
mdn_cut <- log(median(df_alpha2$tmao.x))
# Plot
ggplot(df_alpha2, aes(x=tmao_log, y=shannon)) +
  geom_point(aes(shape=bmi_cat, color = age_cat)) +
  geom_vline(xintercept = mdn_cut, linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE, fullrange = F) +
  theme_minimal() + 
  ylab(y_variable) +
  xlab(x_variable) +
  labs(color = "Age bin", shape = "BMI bin") +
  scale_color_brewer(palette = "Dark2")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

# Faith's Phylogenetic Diversity
y_variable <- "Faith's Phylogenetic Diversity"
x_variable <- "Natural Log of TMAO (uM)"
mdn_cut <- log(median(df_alpha2$tmao.x))
# Plot
ggplot(df_alpha2, aes(x=tmao_log, y=faith_pd)) +
  geom_point(aes(shape=bmi_cat, color = age_cat)) +
  geom_vline(xintercept = mdn_cut, linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE, fullrange = F) +
  theme_minimal() + 
  ylab(y_variable) +
  xlab(x_variable) +
  labs(color = "Age bin", shape = "BMI bin") +
  scale_color_brewer(palette = "Dark2")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

# Pielou's Evenness
y_variable <- "Pielou's Evenness"
x_variable <- "Natural Log of TMAO (uM)"
mdn_cut <- log(median(df_alpha2$tmao.x))
# Plot
ggplot(df_alpha2, aes(x=tmao_log, y=pielou_e)) +
  geom_point(aes(shape=bmi_cat, color = age_cat)) +
  geom_vline(xintercept = mdn_cut, linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE, fullrange = F) +
  theme_minimal() + 
  ylab(y_variable) +
  xlab(x_variable) +
  labs(color = "Age bin", shape = "BMI bin") +
  scale_color_brewer(palette = "Dark2")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

# Observed OTUs
y_variable <- "Observed OTUs"
x_variable <- "Natural Log of TMAO (uM)"
mdn_cut <- log(median(df_alpha2$tmao.x))
# Plot
ggplot(df_alpha2, aes(x=tmao_log, y=observed_otus)) +
  geom_point(aes(shape=bmi_cat, color = age_cat)) +
  geom_vline(xintercept = mdn_cut, linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE, fullrange = F) +
  theme_minimal() + 
  ylab(y_variable) +
  xlab(x_variable) +
  labs(color = "Age bin", shape = "BMI bin") +
  scale_color_brewer(palette = "Dark2")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

# Shannon
y_variable <- "Shannon"
x_variable <- "Natural Log of TMAO (uM)"
mdn_cut <- log(median(df_alpha2$tmao.x))
# Plot
ggplot(df_alpha2, aes(x=tmao_log, y=Shannon)) +
  geom_point(aes(shape=bmi_cat, color = age_cat)) +
  geom_vline(xintercept = mdn_cut, linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE, fullrange = F) +
  theme_minimal() + 
  ylab(y_variable) +
  xlab(x_variable) +
  labs(color = "Age bin", shape = "BMI bin") +
  scale_color_brewer(palette = "Dark2")
## `geom_smooth()` using formula 'y ~ x'

# Faith's Phylogenetic Diversity
y_variable <- "Chao1"
x_variable <- "Natural Log of TMAO (uM)"
mdn_cut <- log(median(df_alpha2$tmao.x))
# Plot
ggplot(df_alpha2, aes(x=tmao_log, y=Chao1)) +
  geom_point(aes(shape=bmi_cat, color = age_cat)) +
  geom_vline(xintercept = mdn_cut, linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE, fullrange = F) +
  theme_minimal() + 
  ylab(y_variable) +
  xlab(x_variable) +
  labs(color = "Age bin", shape = "BMI bin") +
  scale_color_brewer(palette = "Dark2")
## `geom_smooth()` using formula 'y ~ x'

# Pielou's Evenness
y_variable <- "Inverse Simpson"
x_variable <- "Natural Log of TMAO (uM)"
mdn_cut <- log(median(df_alpha2$tmao.x))
# Plot
ggplot(df_alpha2, aes(x=tmao_log, y=InvSimpson)) +
  geom_point(aes(shape=bmi_cat, color = age_cat)) +
  geom_vline(xintercept = mdn_cut, linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE, fullrange = F) +
  theme_minimal() + 
  ylab(y_variable) +
  xlab(x_variable) +
  labs(color = "Age bin", shape = "BMI bin") +
  scale_color_brewer(palette = "Dark2")
## `geom_smooth()` using formula 'y ~ x'

# Observed OTUs
y_variable <- "Observed OTUs"
x_variable <- "Natural Log of TMAO (uM)"
mdn_cut <- log(median(df_alpha2$tmao.x))
# Plot
ggplot(df_alpha2, aes(x=tmao_log, y=Observed)) +
  geom_point(aes(shape=bmi_cat, color = age_cat)) +
  geom_vline(xintercept = mdn_cut, linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE, fullrange = F) +
  theme_minimal() + 
  ylab(y_variable) +
  xlab(x_variable) +
  labs(color = "Age bin", shape = "BMI bin") +
  scale_color_brewer(palette = "Dark2")
## `geom_smooth()` using formula 'y ~ x'

Beta Diversity Analysis

# PCoA
PCoA_WU <- ordinate(PSOtmao5, "PCoA", "wunifrac")
PCoA_UnWU <- ordinate(PSOtmao5, "PCoA", "uunifrac")
PCoA_BC <- ordinate(PSOtmao5, "PCoA", "bray")

# Calculate distance matrix
dismax_wu = phyloseq::distance(PSOtmao5, method="wunifrac", type = "samples")
dismax_uwu = phyloseq::distance(PSOtmao5, method="unifrac", type = "samples")
dismax_bc = phyloseq::distance(PSOtmao5, method="bray", type = "samples")
# Get sample_data
df <- as(sample_data(PSOtmao5), "data.frame")
df$tmao_mdn <- relevel(df$tmao_mdn, ref = "below")

#~~~~~~~~~~~
# Weighted
#~~~~~~~~~~~
# Beta dispersion
bdis <- betadisper(dismax_wu, df$tmao_mdn)
permutest(bdis) # NS, good
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##            Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)  
## Groups      1 0.01125 0.0112527 2.5727    999  0.088 .
## Residuals 353 1.54399 0.0043739                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
boxplot(bdis, las = 2)

# ADONIS (perMANOVA) analysis
adonis2(dismax_wu ~ tmao_log + sex:age + bmi_final.y, data=df, method = "wunifrac", perm=999) # .025
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dismax_wu ~ tmao_log + sex:age + bmi_final.y, data = df, permutations = 999, method = "wunifrac")
##              Df SumOfSqs      R2      F Pr(>F)  
## tmao_log      1   0.0765 0.00845 3.0316  0.030 *
## bmi_final.y   1   0.0372 0.00411 1.4749  0.180  
## sex:age       2   0.1046 0.01155 2.0712  0.046 *
## Residual    350   8.8353 0.97588                
## Total       354   9.0536 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#~~~~~~~~~~~
# Unweighted
#~~~~~~~~~~~
# Beta dispersion
bdis <- betadisper(dismax_uwu, df$tmao_mdn)
permutest(bdis) # NS, good
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##            Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
## Groups      1 0.00353 0.0035268 1.0259    999  0.321
## Residuals 353 1.21359 0.0034379
boxplot(bdis, las = 2)

# ADONIS (perMANOVA) analysis
adonis2(dismax_uwu ~ tmao_log + sex:age + bmi_final.y, data=df, method = "unifrac", perm=999) # .001
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dismax_uwu ~ tmao_log + sex:age + bmi_final.y, data = df, permutations = 999, method = "unifrac")
##              Df SumOfSqs      R2      F Pr(>F)    
## tmao_log      1   0.2004 0.01457 5.2751  0.001 ***
## bmi_final.y   1   0.0549 0.00399 1.4450  0.173    
## sex:age       2   0.2058 0.01496 2.7089  0.003 ** 
## Residual    350  13.2951 0.96648                  
## Total       354  13.7562 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#~~~~~~~~~~~
# BrayCurtis
#~~~~~~~~~~~
# Beta dispersion
bdis <- betadisper(dismax_bc, df$tmao_mdn)
permutest(bdis) # NS, good
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##            Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
## Groups      1 0.02125 0.0212487 2.4195    999  0.112
## Residuals 353 3.10015 0.0087823
boxplot(bdis, las = 2)

# ADONIS (perMANOVA) analysis
adonis2(dismax_bc ~ tmao_log + sex:age + bmi_final.y, data=df, method = "bray", perm=999) # .017
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dismax_bc ~ tmao_log + sex:age + bmi_final.y, data = df, permutations = 999, method = "bray")
##              Df SumOfSqs      R2      F Pr(>F)    
## tmao_log      1    0.254 0.00665 2.4090  0.015 *  
## bmi_final.y   1    0.206 0.00539 1.9512  0.039 *  
## sex:age       2    0.814 0.02132 3.8595  0.001 ***
## Residual    350   36.924 0.96664                  
## Total       354   38.199 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ANCOM

source("../../Rscripts/ancom_v2.1.R")
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## The following object is masked from 'package:IRanges':
## 
##     collapse
## Welcome to compositions, a package for compositional data analysis.
## Find an intro with "? compositions"
## 
## Attaching package: 'compositions'
## The following objects are masked from 'package:IRanges':
## 
##     cor, cov, var
## The following objects are masked from 'package:S4Vectors':
## 
##     cor, cov, var
## The following objects are masked from 'package:BiocGenerics':
## 
##     normalize, var
## The following objects are masked from 'package:stats':
## 
##     anova, cor, cov, dist, var
## The following objects are masked from 'package:base':
## 
##     %*%, norm, scale, scale.default
# OTU table
MicroData <- otu_table(PSOtmao5)
MicroData <- as.data.frame(as.matrix(MicroData), stringsAsFactors = F)
# Add 1
#MicroData2 <- MicroData +1
#MicroData2 <- as.data.frame(as.matrix(t(MicroData2),stringsAsFactors = F))
#dim(MicroData2) # 103 x 98

# Load the phenotype data
#------------------------
# Perform ANCOM analysis
# Step 1: Data preprocessing
# OTU data
otu_data <- otu_table(PSOtmao5)
otu_data <- as.data.frame(as.matrix(otu_data), stringsAsFactors = F)
otu_id <- rownames(otu_data)

# Metadata 
#df <- readRDS("../data_processed/mapping/Biocrates_mapping_BD1_210809.rds")
#df$p.Cresol.SO4_bd1_mdn <- plyr::revalue(df$p.Cresol.SO4_bd1_mdn, c("0" = "< Median", "1" = "> Median"))
# Subset
MetaData2 <- df_alpha2 %>% dplyr::select("tmao_mdn")
MetaData2$Sample.ID <- rownames(MetaData2)

feature_table = otu_data; sample_var = "Sample.ID"; group_var = NULL
out_cut = 0.05; zero_cut = 0.95; lib_cut = 1000; neg_lb = FALSE # make zero_cut=0.95 to match criteria used for other metabolites
# Function
prepro = feature_table_pre_process(feature_table = feature_table,
                                   meta_data = MetaData2,
                                   sample_var = sample_var,
                                   group_var = NULL,
                                   out_cut = out_cut,
                                   zero_cut = zero_cut,
                                   lib_cut = lib_cut,
                                   neg_lb = neg_lb)

# Step 2: ANCOM
feature_table = prepro$feature_table # preprocessing went from 98 to 60 bugs
meta_data = prepro$meta_data
struc_zero = prepro$structure_zeros

# Define variables for ANCOM
main_var = "tmao_mdn"; p_adj_method = "BH"; alpha = 0.05
adj_formula = NULL; rand_formula = NULL
# Run ANCCOM
t_start = Sys.time()
res = ANCOM(feature_table = feature_table, 
            meta_data = meta_data, 
            struc_zero = struc_zero, 
            main_var = main_var, 
            p_adj_method = p_adj_method, 
            alpha = alpha, 
            adj_formula = adj_formula, 
            rand_formula = rand_formula)
t_end = Sys.time()
t_run = t_end - t_start 
t_run # around 5s
## Time difference of 2.453642 secs
# Results
res$out
##                             taxa_id  W detected_0.9 detected_0.8 detected_0.7
## 1  7958dd98cfbbdc096240cd1dc4531423 18        FALSE        FALSE        FALSE
## 2  05404c9fdf9f3f334eb618bac3f434f6  0        FALSE        FALSE        FALSE
## 3  06105df60508c2ed24a54f1b8ed64e49  0        FALSE        FALSE        FALSE
## 4  bb1b75f41ff9c9db1d1de41e8388eb52  0        FALSE        FALSE        FALSE
## 5  db8f48a3fe2fca95fb4986f5507b9076  0        FALSE        FALSE        FALSE
## 6  3f5533292a655d0f54a64560d65e823b  0        FALSE        FALSE        FALSE
## 7  803eb52cfe3d77bdf9fe14c011e425fb  0        FALSE        FALSE        FALSE
## 8  a4bb7f5133099cd74c3817b8d3383326  0        FALSE        FALSE        FALSE
## 9  a41766e0533ab64a9966f362a7938478  0        FALSE        FALSE        FALSE
## 10 5f0cbc930515c7ec606289b79d8c4ba3  0        FALSE        FALSE        FALSE
## 11 8fabb679415fe6a9969015076873e211 26        FALSE        FALSE        FALSE
## 12 7363aac88b5325ac49cb469e284e10dd  1        FALSE        FALSE        FALSE
## 13 2dc2cf60102ff00cd077f2304ec84d06  0        FALSE        FALSE        FALSE
## 14 04badf3377fec5aadb882276e3b3ca4f 11        FALSE        FALSE        FALSE
## 15 ebf61ede654f305596908efa9a8ddf54  4        FALSE        FALSE        FALSE
## 16 a3f9e91d15a189d0cf1e0596b9688957  8        FALSE        FALSE        FALSE
## 17 5b526c732aa8f6b7ddbf6b35925d5011  0        FALSE        FALSE        FALSE
## 18 b180a2455b236c103cf0bfe620095736 17        FALSE        FALSE        FALSE
## 19 e1a2800b24cdf9779b28dc897cddb12a  4        FALSE        FALSE        FALSE
## 20 f2cd13764c9588bd921f3a53138830da  2        FALSE        FALSE        FALSE
## 21 a7eb050200d9b2408521cc7ae42327ba 18        FALSE        FALSE        FALSE
## 22 4a4d11bb32e68079b4c71483bf0fcf36  1        FALSE        FALSE        FALSE
## 23 6960eba3db7d4d863d042ab497d7481a  0        FALSE        FALSE        FALSE
## 24 a38b6324c57e2ef3c842180166aeb60f  0        FALSE        FALSE        FALSE
## 25 172049e46605909f99651e4d0efb8578  2        FALSE        FALSE        FALSE
## 26 6e13c195d4554dd8cf4923df9decd183  0        FALSE        FALSE        FALSE
## 27 d2a695aa271adf072749a729fd96a731  4        FALSE        FALSE        FALSE
## 28 520eb0909e5157503fbed62cbb4e73ca 33        FALSE         TRUE         TRUE
## 29 42de4e368cca3ff50954f82c12dd5315  2        FALSE        FALSE        FALSE
## 30 f966e124604e0e32b209b88df6e42cd4  1        FALSE        FALSE        FALSE
## 31 f7c08b0d8e574f7c089b95e0e4344d5e  0        FALSE        FALSE        FALSE
## 32 1dda53416f231a3345668df39d4ae780  1        FALSE        FALSE        FALSE
## 33 8c6b152535efbebf12179855cb31b8d8  0        FALSE        FALSE        FALSE
## 34 8aa67fe08404e3d574aafb6622786c0f  4        FALSE        FALSE        FALSE
## 35 ec70d97b11476164cbd828e126ad10da  0        FALSE        FALSE        FALSE
## 36 5c4ca852b40641b3eb0ad23e69bb6583  3        FALSE        FALSE        FALSE
## 37 ebe91a7d912f432d8726b0cef99d18c3  1        FALSE        FALSE        FALSE
## 38 9a299c48d0e3ddea2122806528070d6e  1        FALSE        FALSE        FALSE
## 39 e4ae256bb51896c21795f743dc9ed9dd  0        FALSE        FALSE        FALSE
## 40 7a297761643e32391ea7cd03b0995e62  1        FALSE        FALSE        FALSE
## 41 73b4f14d16f02ee0ac82838166128de4  0        FALSE        FALSE        FALSE
## 42 e00f85cb3452e37943500e86afb268f4  2        FALSE        FALSE        FALSE
##    detected_0.6
## 1         FALSE
## 2         FALSE
## 3         FALSE
## 4         FALSE
## 5         FALSE
## 6         FALSE
## 7         FALSE
## 8         FALSE
## 9         FALSE
## 10        FALSE
## 11         TRUE
## 12        FALSE
## 13        FALSE
## 14        FALSE
## 15        FALSE
## 16        FALSE
## 17        FALSE
## 18        FALSE
## 19        FALSE
## 20        FALSE
## 21        FALSE
## 22        FALSE
## 23        FALSE
## 24        FALSE
## 25        FALSE
## 26        FALSE
## 27        FALSE
## 28         TRUE
## 29        FALSE
## 30        FALSE
## 31        FALSE
## 32        FALSE
## 33        FALSE
## 34        FALSE
## 35        FALSE
## 36        FALSE
## 37        FALSE
## 38        FALSE
## 39        FALSE
## 40        FALSE
## 41        FALSE
## 42        FALSE
res$fig$data
##                                                           taxa_id            x
## 7958dd98cfbbdc096240cd1dc4531423 7958dd98cfbbdc096240cd1dc4531423 -0.280831110
## 05404c9fdf9f3f334eb618bac3f434f6 05404c9fdf9f3f334eb618bac3f434f6 -0.109947446
## 06105df60508c2ed24a54f1b8ed64e49 06105df60508c2ed24a54f1b8ed64e49 -0.144866687
## bb1b75f41ff9c9db1d1de41e8388eb52 bb1b75f41ff9c9db1d1de41e8388eb52 -0.151237650
## db8f48a3fe2fca95fb4986f5507b9076 db8f48a3fe2fca95fb4986f5507b9076 -0.216337464
## 3f5533292a655d0f54a64560d65e823b 3f5533292a655d0f54a64560d65e823b -0.190366814
## 803eb52cfe3d77bdf9fe14c011e425fb 803eb52cfe3d77bdf9fe14c011e425fb  0.028090931
## a4bb7f5133099cd74c3817b8d3383326 a4bb7f5133099cd74c3817b8d3383326 -0.218582804
## a41766e0533ab64a9966f362a7938478 a41766e0533ab64a9966f362a7938478 -0.103103094
## 5f0cbc930515c7ec606289b79d8c4ba3 5f0cbc930515c7ec606289b79d8c4ba3 -0.006987765
## 8fabb679415fe6a9969015076873e211 8fabb679415fe6a9969015076873e211  0.435332755
## 7363aac88b5325ac49cb469e284e10dd 7363aac88b5325ac49cb469e284e10dd -0.355323392
## 2dc2cf60102ff00cd077f2304ec84d06 2dc2cf60102ff00cd077f2304ec84d06 -0.117759192
## 04badf3377fec5aadb882276e3b3ca4f 04badf3377fec5aadb882276e3b3ca4f  0.185791740
## ebf61ede654f305596908efa9a8ddf54 ebf61ede654f305596908efa9a8ddf54  0.102501694
## a3f9e91d15a189d0cf1e0596b9688957 a3f9e91d15a189d0cf1e0596b9688957  0.141465628
## 5b526c732aa8f6b7ddbf6b35925d5011 5b526c732aa8f6b7ddbf6b35925d5011 -0.064884721
## b180a2455b236c103cf0bfe620095736 b180a2455b236c103cf0bfe620095736 -0.215089384
## e1a2800b24cdf9779b28dc897cddb12a e1a2800b24cdf9779b28dc897cddb12a  0.134141150
## f2cd13764c9588bd921f3a53138830da f2cd13764c9588bd921f3a53138830da  0.089633411
## a7eb050200d9b2408521cc7ae42327ba a7eb050200d9b2408521cc7ae42327ba  0.284550374
## 4a4d11bb32e68079b4c71483bf0fcf36 4a4d11bb32e68079b4c71483bf0fcf36 -0.013929624
## 6960eba3db7d4d863d042ab497d7481a 6960eba3db7d4d863d042ab497d7481a  0.051435474
## a38b6324c57e2ef3c842180166aeb60f a38b6324c57e2ef3c842180166aeb60f  0.024368828
## 172049e46605909f99651e4d0efb8578 172049e46605909f99651e4d0efb8578  0.110678577
## 6e13c195d4554dd8cf4923df9decd183 6e13c195d4554dd8cf4923df9decd183 -0.037903867
## d2a695aa271adf072749a729fd96a731 d2a695aa271adf072749a729fd96a731  0.177269044
## 520eb0909e5157503fbed62cbb4e73ca 520eb0909e5157503fbed62cbb4e73ca  0.328926258
## 42de4e368cca3ff50954f82c12dd5315 42de4e368cca3ff50954f82c12dd5315  0.169326544
## f966e124604e0e32b209b88df6e42cd4 f966e124604e0e32b209b88df6e42cd4 -0.032098231
## f7c08b0d8e574f7c089b95e0e4344d5e f7c08b0d8e574f7c089b95e0e4344d5e  0.034876899
## 1dda53416f231a3345668df39d4ae780 1dda53416f231a3345668df39d4ae780  0.097444505
## 8c6b152535efbebf12179855cb31b8d8 8c6b152535efbebf12179855cb31b8d8 -0.008875955
## 8aa67fe08404e3d574aafb6622786c0f 8aa67fe08404e3d574aafb6622786c0f  0.056769543
## ec70d97b11476164cbd828e126ad10da ec70d97b11476164cbd828e126ad10da -0.020093592
## 5c4ca852b40641b3eb0ad23e69bb6583 5c4ca852b40641b3eb0ad23e69bb6583 -0.153234669
## ebe91a7d912f432d8726b0cef99d18c3 ebe91a7d912f432d8726b0cef99d18c3  0.096916247
## 9a299c48d0e3ddea2122806528070d6e 9a299c48d0e3ddea2122806528070d6e -0.016568400
## e4ae256bb51896c21795f743dc9ed9dd e4ae256bb51896c21795f743dc9ed9dd -0.109947595
## 7a297761643e32391ea7cd03b0995e62 7a297761643e32391ea7cd03b0995e62  0.090847398
## 73b4f14d16f02ee0ac82838166128de4 73b4f14d16f02ee0ac82838166128de4 -0.055388395
## e00f85cb3452e37943500e86afb268f4 e00f85cb3452e37943500e86afb268f4 -0.017009151
##                                   y zero_ind
## 7958dd98cfbbdc096240cd1dc4531423 18       No
## 05404c9fdf9f3f334eb618bac3f434f6  0       No
## 06105df60508c2ed24a54f1b8ed64e49  0       No
## bb1b75f41ff9c9db1d1de41e8388eb52  0       No
## db8f48a3fe2fca95fb4986f5507b9076  0       No
## 3f5533292a655d0f54a64560d65e823b  0       No
## 803eb52cfe3d77bdf9fe14c011e425fb  0       No
## a4bb7f5133099cd74c3817b8d3383326  0       No
## a41766e0533ab64a9966f362a7938478  0       No
## 5f0cbc930515c7ec606289b79d8c4ba3  0       No
## 8fabb679415fe6a9969015076873e211 26       No
## 7363aac88b5325ac49cb469e284e10dd  1       No
## 2dc2cf60102ff00cd077f2304ec84d06  0       No
## 04badf3377fec5aadb882276e3b3ca4f 11       No
## ebf61ede654f305596908efa9a8ddf54  4       No
## a3f9e91d15a189d0cf1e0596b9688957  8       No
## 5b526c732aa8f6b7ddbf6b35925d5011  0       No
## b180a2455b236c103cf0bfe620095736 17       No
## e1a2800b24cdf9779b28dc897cddb12a  4       No
## f2cd13764c9588bd921f3a53138830da  2       No
## a7eb050200d9b2408521cc7ae42327ba 18       No
## 4a4d11bb32e68079b4c71483bf0fcf36  1       No
## 6960eba3db7d4d863d042ab497d7481a  0       No
## a38b6324c57e2ef3c842180166aeb60f  0       No
## 172049e46605909f99651e4d0efb8578  2       No
## 6e13c195d4554dd8cf4923df9decd183  0       No
## d2a695aa271adf072749a729fd96a731  4       No
## 520eb0909e5157503fbed62cbb4e73ca 33       No
## 42de4e368cca3ff50954f82c12dd5315  2       No
## f966e124604e0e32b209b88df6e42cd4  1       No
## f7c08b0d8e574f7c089b95e0e4344d5e  0       No
## 1dda53416f231a3345668df39d4ae780  1       No
## 8c6b152535efbebf12179855cb31b8d8  0       No
## 8aa67fe08404e3d574aafb6622786c0f  4       No
## ec70d97b11476164cbd828e126ad10da  0       No
## 5c4ca852b40641b3eb0ad23e69bb6583  3       No
## ebe91a7d912f432d8726b0cef99d18c3  1       No
## 9a299c48d0e3ddea2122806528070d6e  1       No
## e4ae256bb51896c21795f743dc9ed9dd  0       No
## 7a297761643e32391ea7cd03b0995e62  1       No
## 73b4f14d16f02ee0ac82838166128de4  0       No
## e00f85cb3452e37943500e86afb268f4  2       No
# Step 3: Volcano Plot
# Number of taxa except structural zeros
n_taxa = ifelse(is.null(struc_zero), nrow(feature_table), sum(apply(struc_zero, 1, sum) == 0))
# Cutoff values for declaring differentially abundant taxa
cut_off = c(0.9 * (n_taxa -1), 0.8 * (n_taxa -1), 0.7 * (n_taxa -1), 0.6 * (n_taxa -1))
names(cut_off) = c("detected_0.9", "detected_0.8", "detected_0.7", "detected_0.6")

# Annotation data
dat_ann = data.frame(x = min(res$fig$data$x), y = cut_off["detected_0.7"], label = "W[0.7]")

fig = res$fig +  
  geom_hline(yintercept = cut_off["detected_0.7"], linetype = "dashed") + 
  geom_text(data = dat_ann, aes(x = x, y = y, label = label), 
            size = 4, vjust = -0.5, hjust = 0, color = "orange", parse = TRUE)
fig  

# ORGANIZE PLOT

TaxaData2 <- readRDS("../../data/processed/microbiome/phyloseq_inputs/merged-taxonomy-dada2-retrained.rds")
rownames(TaxaData2) <- TaxaData2$Feature.ID
## Warning: Setting row names on a tibble is deprecated.
ancomRes <- res$out

test <- merge(ancomRes, TaxaData2, by.x = "taxa_id", by.y = "Feature.ID", all.x = TRUE, all.y = FALSE)

# Make plot 
tempdata <- test[test$detected_0.6 == TRUE,] # 0.6 but can make 0.7
asv_id <- unique(tempdata$taxa_id)

MicroData4 <- feature_table[row.names(feature_table) %in% asv_id,]

# Rename to the taxonomy 
#MicroData5 <- as.data.frame(as.matrix(t(MicroData4)))
TaxaData3 <- TaxaData2 %>% dplyr::select(Order, Family, Genus)
# Make a unique naming column that incorporates multiple levels of taxonomy
TaxaData3$OrderFamilyGenus <- paste(TaxaData3$Order, TaxaData3$Family, TaxaData3$Genus)
rownames(TaxaData3) <- rownames(TaxaData2)
## Warning: Setting row names on a tibble is deprecated.
# Merge
MicroData5 <- merge(MicroData4, TaxaData3,
                    by=0, all.x = T, all.y = F)

row.names(MicroData5) <- MicroData5$OrderFamilyGenus

MicroData5 <- MicroData5 %>%  dplyr::select(-Order, -Family, -Genus, -OrderFamilyGenus, -Row.names)
MicroData5 <- as.data.frame(as.matrix(t(MicroData5)))

# Merge with the phenotype data 
MetaData3 <- MetaData2 %>% dplyr::select(tmao_mdn)

ggData <- merge(MetaData3, MicroData5, by =0)
row.names(ggData) <- ggData$Row.names
ggData <- ggData %>% dplyr::select(-Row.names)

# Gather the data
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
ggData2 <- gather(ggData,
                  key = "Taxa",
                  value = "Abundance",
                  -tmao_mdn)
                
ggData2$Abundance <- log(ggData2$Abundance)

# Plot it
p <- ggplot(data = ggData2,
            aes(x = Taxa, 
                y = Abundance,
                fill = tmao_mdn), 
            alpha = 0.1) +
  geom_boxplot(lwd=.25) +
  xlab(NULL) +
  ylab("Log of Taxa Abundance") +
  theme_bw() + # note order matters here - need bw to be before theme()
  theme(axis.text.y = element_text( hjust = 1),
        axis.text.x = element_text(angle = 60, hjust = 1),
        text=element_text(size=11))  +
  scale_fill_manual(values=c("#2D708EFF", "#B8DE29FF")) + 
  #scale_fill_viridis(discrete = TRUE) +
  #coord_flip() +
  labs(fill = "TMAO \n Classification")

p
## Warning: Removed 289 rows containing non-finite values (stat_boxplot).